import pandas as pd
import numpy as np
import requests
import tweepy
import os
from functools import reduce
from io import StringIO
import json
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
Gathering three different datasets using different methods: the requests library, StringIO, and Tweepy, as well as JSON.
# Gather the archived Twitter data from the local CSV export
archive_csv_path = r'C:\Users\ADMIN\Desktop\data wrangling\Project 2\twitter-archive-enhanced.csv'
df_twitter = pd.read_csv(archive_csv_path)
# Preview the first five rows, then the (rows, columns) shape
df_twitter.head()
df_twitter.shape
# Gather the image-prediction data over HTTP using the requests library
image_link = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
# A timeout keeps the download from hanging forever on an unresponsive server
image_load = requests.get(image_link, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page as TSV
image_load.raise_for_status()
# View the image_load status
image_load
# Decode the raw response bytes into UTF-8 text and wrap it in a file-like
# object so that pandas can read it
images = image_load.content.decode('UTF-8')
image_str = StringIO(images)
# Load the tab-separated image dataset
df_images = pd.read_csv(image_str, sep='\t')
# Check column names
df_images.columns
# Check number of rows and columns
df_images.shape
# Load the JSON file, which holds one tweet object per line
df_api = []
# Explicit encoding so the read does not depend on the platform default
# (assumes the file is UTF-8/ASCII -- TODO confirm against how it was written)
with open('tweet_json.txt', encoding='utf-8') as jsfile:
    for line in jsfile:
        # Skip blank lines (e.g. a trailing newline), which json.loads rejects
        if not line.strip():
            continue
        tweet = json.loads(line)
        tweet_id = tweet['id']
        retweet_count = tweet['retweet_count']
        favourite_count = tweet['favorite_count']
        df_api.append({'tweet_id': tweet_id,
                       'favorite_count': favourite_count,
                       'retweet_count': retweet_count})
# Put the list of records into a dataframe with a fixed column order
retweet_data = pd.DataFrame(df_api, columns=['tweet_id', 'retweet_count', 'favorite_count'])
# Check the number of rows and columns
retweet_data.shape
# Check column names
retweet_data.columns
During this stage we will be addressing two things:
# Assess the first dataset: preview the first five rows
df_twitter.head()
# Five randomly sampled rows for visual assessment
df_twitter.sample(n=5)
# Assess the source column: total of all per-value counts
df_twitter['source'].value_counts().sum()
# Column dtypes and non-null counts
df_twitter.info()
def checknumofnulls(column, string):
    """Report how many values in *column* are null and what share that is.

    Args:
        column: pandas Series to inspect.
        string: Label text, returned unchanged as the first tuple element.

    Returns:
        A tuple ``(string, null_count, percentage)`` where ``percentage`` is
        the share of nulls among ALL values of the column, rounded to a whole
        number. An empty column yields a percentage of ``0.0``.
    """
    null_count = column.isna().sum()
    # Divide by the full length: value_counts() ignores nulls, so using it as
    # the denominator would overstate the share (and divide by zero when
    # every value is null).
    total = len(column)
    percentage = float(null_count / total * 100) if total else 0.0
    if null_count == 0:
        # Print the column's name; concatenating the Series itself to a string
        # is element-wise and raises on non-string columns.
        print(f"No null value in {column.name}")
    return string, null_count, round(percentage, 0)
# Check nulls in the expanded_urls column. Each label is built from the
# checked column's own name rather than a positional index into
# df_twitter.columns, so the message always names the column being inspected.
checknumofnulls(df_twitter.expanded_urls, f"the number of nulls and their percentage in {df_twitter.expanded_urls.name} is")
# Check nulls in the text column
checknumofnulls(df_twitter.text, f"the number of nulls and their percentage in {df_twitter.text.name} is")
# Check nulls in the name column (bracket access: the attribute form is
# easy to confuse with a Series' own .name attribute)
checknumofnulls(df_twitter['name'], f"the number of nulls and their percentage in {df_twitter['name'].name} is")
# Check the first 5 rows of the second dataset
df_images.head()
# Check the number of rows and columns
df_images.shape
# Check dtypes
df_images.info()
# Null checks. Each label is taken from the checked column's own name, so it
# cannot point at a different column (or a different dataframe) by mistake.
# Check nulls in jpg_url
checknumofnulls(df_images.jpg_url, f"the number of nulls and their percentage in {df_images.jpg_url.name} is")
# Check nulls in p1
checknumofnulls(df_images.p1, f"the number of nulls and their percentage in {df_images.p1.name} is")
# Check nulls in p2 (the label previously indexed df_twitter.columns)
checknumofnulls(df_images.p2, f"the number of nulls and their percentage in {df_images.p2.name} is")
# Check nulls in p3
checknumofnulls(df_images.p3, f"the number of nulls and percentage in column {df_images.p3.name} is")
# Count the True/False values in p1_dog
df_images.p1_dog.value_counts()
def percent_dog(column, string):
    """Return the percentage of True and of False values in a boolean column.

    Args:
        column: pandas Series of booleans; null entries are ignored.
        string: Label text, returned unchanged as the first tuple element.

    Returns:
        A tuple ``(string, true_percent, false_percent)`` with both
        percentages rounded to whole numbers. When the column has no non-null
        values, both percentages are NaN.
    """
    # Drop nulls first: a column holding nulls is object-typed, and neither
    # summing it nor applying ``~`` to it works.
    valid = column.dropna().astype(bool)
    total = len(valid)
    if total == 0:
        # No data: the shares are undefined, so avoid a 0/0 division.
        undefined = float("nan")
        return string, undefined, undefined
    true_count = valid.sum()
    false_count = total - true_count
    true_percent = true_count / total * 100
    false_percent = false_count / total * 100
    return string, true_percent.round(), false_percent.round()
# True/False percentages for each of the three prediction flag columns
percent_dog(df_images['p1_dog'], f"the percentage of true and false values in {df_images.columns[5]} is")
percent_dog(df_images['p2_dog'], f"the percentage of true and false values in {df_images.columns[8]} is")
percent_dog(df_images['p3_dog'], f"the percentage of true and false values in {df_images.columns[11]} is")
# Preview the first five rows of the retweet/favorite dataset
retweet_data.head(5)
# Five random rows for visual assessment
retweet_data.sample(n=5)
# Dtypes and non-null counts
retweet_data.info()
# Work on deep copies so the original dataframes stay untouched
df_twitter_clean = df_twitter.copy(deep=True)
df_images_clean = df_images.copy(deep=True)
retweet_data_clean = retweet_data.copy(deep=True)
# Check the distinct values in the source column
df_twitter_clean.source.value_counts()
# Short, readable names for each raw HTML anchor found in the source column
source_names = {
    '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>': 'Twitter for iphone',
    '<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>': 'Vine',
    '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>': 'Twitter Web Client',
    '<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>': 'TweetDeck',
}
# Sources missing from the mapping keep their original text; a bare .map()
# would silently turn them into NaN.
df_twitter_clean.source = df_twitter_clean.source.map(source_names).fillna(df_twitter_clean.source)
# Confirm the replacement
df_twitter_clean['source'].value_counts()
# Drop the reply/retweet metadata columns and the expanded URLs in one call
unused_columns = [
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'retweeted_status_id',
    'retweeted_status_user_id',
    'retweeted_status_timestamp',
    'expanded_urls',
]
df_twitter_clean.drop(columns=unused_columns, inplace=True)
# Confirm the remaining columns
df_twitter_clean.info()
# Parse the timestamp column into datetimes
df_twitter_clean['timestamp'] = pd.to_datetime(df_twitter_clean.timestamp)
df_twitter_clean['timestamp'].dtype
# Store tweet_id as a string in all three dataframes
for frame in (df_twitter_clean, df_images_clean, retweet_data_clean):
    frame['tweet_id'] = frame['tweet_id'].astype('string')
# Confirm the new tweet_id dtypes
df_twitter_clean['tweet_id'].dtype
df_images_clean['tweet_id'].dtype
retweet_data_clean['tweet_id'].dtype
# Convert each dog-stage column to a categorical dtype
for stage_column in ('doggo', 'floofer', 'pupper', 'puppo'):
    df_twitter_clean[stage_column] = df_twitter_clean[stage_column].astype('category')
# Confirm the new dtypes
df_twitter_clean['doggo'].dtype
df_twitter_clean['floofer'].dtype
df_twitter_clean['pupper'].dtype
df_twitter_clean['puppo'].dtype
# Inspect the name values before cleaning
df_twitter_clean['name'].value_counts()
# Swap the literal 'None' placeholder for a clearer 'No Name' label
df_twitter_clean['name'] = df_twitter_clean['name'].replace({'None': 'No Name'})
# Inspect the name values after cleaning
df_twitter_clean['name'].value_counts()
Define: "the", "a", and "an" are not valid dog names, so rows with these names must be filtered out.
# Remove rows whose name is one of the articles "the", "a" or "an"
non_names = ['the', 'a', 'an']
df_twitter_clean = df_twitter_clean[~df_twitter_clean['name'].isin(non_names)]
# Test against the VALUES: `"the" not in series` only inspects the index
# labels, so it would pass even if the filter had removed nothing.
assert not df_twitter_clean['name'].isin(non_names).any()
# Remove rows whose text starts with the 'RT @' retweet prefix.
# na=False treats a missing text as "not a retweet" so the mask stays boolean.
is_retweet = df_twitter_clean.text.str.startswith('RT @', na=False)
df_twitter_clean = df_twitter_clean[~is_retweet]
# Test against the VALUES: `"RT @" not in series` only inspects the index
# labels and therefore always passed.
assert not df_twitter_clean.text.str.startswith('RT @', na=False).any()
# Inspect the distribution of numerator values. value_counts must be CALLED;
# without parentheses the expression is just the bound method.
df_twitter_clean.rating_numerator.value_counts()
# Remove rows with a zero numerator rating
df_twitter_clean = df_twitter_clean[df_twitter_clean.rating_numerator != 0]
# Test: no zero numerators should remain
df_twitter_clean.query('rating_numerator == 0')
# Check for zeros in the rating denominator
df_twitter_clean.query('rating_denominator == 0')
# Remove rows with a zero denominator rating
df_twitter_clean = df_twitter_clean[df_twitter_clean.rating_denominator != 0]
# Test: no zero denominators should remain
df_twitter_clean.query('rating_denominator == 0')
# Store both rating parts as strings so they can be joined as text
# ("numerator/denominator") instead of being divided into a float
numerators = df_twitter_clean['rating_numerator'].astype('string')
denominators = df_twitter_clean['rating_denominator'].astype('string')
df_twitter_clean['rating_numerator'] = numerators
df_twitter_clean['rating_denominator'] = denominators
# Join the two parts row by row into a single rating column
df_twitter_clean['rating'] = [
    numerator + "/" + denominator
    for numerator, denominator in zip(numerators, denominators)
]
df_twitter_clean['rating'].value_counts()
# The separate numerator and denominator columns are no longer needed
df_twitter_clean.drop(columns=['rating_numerator', 'rating_denominator'], inplace=True)
# Confirm the columns are gone
df_twitter_clean.info()
# Build dog_stage by concatenating the four stage columns row by row
stage_columns = ['doggo', 'floofer', 'pupper', 'puppo']
df_twitter_clean['dog_stage'] = [
    doggo + floofer + pupper + puppo
    for doggo, floofer, pupper, puppo in zip(
        *(df_twitter_clean[stage] for stage in stage_columns)
    )
]
# Inspect the concatenated values
df_twitter_clean['dog_stage'].value_counts()
# Translate each concatenated value into a readable stage label
stage_labels = {
    'NoneNoneNoneNone': 'unknown',
    'NoneNonepupperNone': 'pupper',
    'doggoNoneNoneNone': 'doggo',
    'NoneNoneNonepuppo': 'puppo',
    'doggoNonepupperNone': 'doggo pupper',
    'NoneflooferNoneNone': 'floofer',
    'doggoNoneNonepuppo': 'doggo puppo',
    'doggoflooferNoneNone': 'doggo floofer',
}
df_twitter_clean['dog_stage'] = df_twitter_clean['dog_stage'].map(stage_labels)
# Confirm the labels were assigned
df_twitter_clean['dog_stage'].value_counts()
# The four source columns are now redundant
df_twitter_clean.drop(columns=stage_columns, inplace=True)
df_twitter_clean.info()
from functools import reduce
# Inner-join the three cleaned dataframes on tweet_id, left to right
tweet_master = (
    df_twitter_clean
    .merge(df_images_clean, on=['tweet_id'], how='inner')
    .merge(retweet_data_clean, on=['tweet_id'], how='inner')
)
# Inspect the merged shape and columns
tweet_master.shape
tweet_master.columns
The master dataset will be saved as a CSV file.
# Write the merged dataset to disk without the index column
master_csv_path = 'twitter_archive_master.csv'
tweet_master.to_csv(master_csv_path, index=False)
# Preview the first three rows
tweet_master.head(n=3)
# Bar chart of the 35 most frequent rating values
top_rating_counts = tweet_master['rating'].value_counts().iloc[:35]
top_rating_counts.plot(
    kind='bar',
    figsize=(10, 10),
    ylabel='Ratings Count',
    xlabel='Ratings',
    title="Rating Value Counts",
)
plt.savefig('Ratings_value_counts.png')
# Horizontal bar chart of the (up to) 10 most frequent dog stages
top_stage_counts = tweet_master['dog_stage'].value_counts().iloc[:10]
top_stage_counts.plot(
    kind='barh',
    figsize=(10, 8),
    ylabel='Dog Stages ',
    xlabel='Dog Stages Count',
    title="Number of dogs in each stage",
    color="lightblue",
)
plt.savefig('dog_stage_count_values.png')
# Line plot of favorite_count against retweet_count on a dedicated figure
fig, ax = plt.subplots(figsize=(8, 6), dpi=600)
chart = sns.lineplot(data=tweet_master, x='retweet_count', y='favorite_count')
chart.set_title(' Correlation Between Retweet Count and Favorite Count')
plt.show()
# Saving through the figure handle still works after plt.show()
fig.savefig('correlation_Retweet_x_Favourite_count.png')
# Strip plot of the first prediction's confidence (p1_conf) per dog stage
sns.catplot(x="dog_stage", y="p1_conf", kind="strip", data=tweet_master
,height=8, aspect=10/8)
plt.title("The first prediction confidence percentage per dog stage")
# NOTE(review): sns.set is called after the first plot has been drawn, so it
# presumably only influences the plots created afterwards -- confirm intent
sns.set(font_scale = 1.5)
plt.savefig('1st_pred_per_dog_percentage.png')
# Strip plot of the second prediction's confidence (p2_conf) per dog stage
sns.catplot(x="dog_stage", y="p2_conf", kind="strip", data=tweet_master
,height=8, aspect=10/8)
plt.title("The second prediction confidence percenetage per dog stage" , fontsize = 20)
plt.savefig('2nd_pred_per_dog_percentage.png')
# Strip plot of the third prediction's confidence (p3_conf) per dog stage;
# unlike the two plots above, this one is saved without a title
sns.catplot(x="dog_stage", y="p3_conf", kind="strip", data=tweet_master
,height=8, aspect=10/8)
plt.savefig('3rd_pred_per_dog_percentage.png')
# Horizontal bar charts of the True/False counts in the boolean
# p1_dog, p2_dog and p3_dog columns, each saved to its own file
tweet_master.p1_dog.value_counts().plot(kind='barh', title='True vs false Values in 1st prediction')
plt.savefig('true_v_false_1st.png')
tweet_master.p2_dog.value_counts().plot(kind='barh', title='True vs false Values in 2nd prediction')
plt.savefig('true_v_false_2nd.png')
# The third chart shows p3_dog, so its title must say "3rd" (it previously
# repeated the "2nd prediction" title)
tweet_master.p3_dog.value_counts().plot(kind='barh', title='True vs false Values in 3rd prediction')
plt.savefig('true_v_false_3rd.png')
# Correlation between rating and favorite count
fig, ax = plt.subplots(figsize=(30, 30), dpi=600)
chart = sns.lineplot(x='rating', y='favorite_count', data=tweet_master)
chart.set(title=' Correlation Between Rating and Favorite Count')
# NOTE(review): the palette returned here is discarded, so this call does not
# change the plot's colors -- confirm whether sns.set_palette was intended
sns.color_palette("bright")
sns.set(font_scale=1.5)
plt.xticks(rotation=45, fontsize=20)
plt.yticks(fontsize=20)
# Save through the figure handle BEFORE plt.show(): calling plt.savefig after
# plt.show() writes a new, empty figure instead of this chart.
fig.savefig('correlation_rating_favorite.png')
plt.show()
# Share of rows whose name is the "No Name" placeholder
unnamed_rows = tweet_master[tweet_master['name'] == "No Name"]
non_name_percentage = len(unnamed_rows) / len(tweet_master) * 100
print(f"The percentage number of dogs with no names {non_name_percentage}%")
# Share of rows whose dog stage is "unknown"
unknown_stage_rows = tweet_master[tweet_master['dog_stage'] == "unknown"]
unkwown_percentage = len(unknown_stage_rows) / len(tweet_master) * 100
print(f"The percentage number of unknown dog stages is {unkwown_percentage}%")
# Mean retweet count per calendar month, drawn as a bar chart
tweet_month = tweet_master["timestamp"].dt.month
monthly_retweet_mean = tweet_master.groupby(tweet_month)["retweet_count"].mean()
monthly_retweet_mean.plot(kind='bar')
plt.title("Months and their retweet count ")
plt.xlabel("Months")
plt.ylabel("rewteet_count")
plt.savefig('rewteet_count_in_Months.png')
# Count of every rating value within each year, drawn as one wide bar chart
tweet_year = tweet_master["timestamp"].dt.year
yearly_rating_counts = tweet_master.groupby(tweet_year)["rating"].value_counts()
yearly_rating_counts.plot(kind='bar', figsize=(30, 25))
plt.title("Every rating value in each year ")
plt.xlabel("years")
plt.xticks(rotation=45)
plt.ylabel("ratings value counts")
plt.savefig('Rating_count_by_Year.png')
# Mean favorite count per calendar month, drawn as a bar chart
favorite_month = tweet_master["timestamp"].dt.month
monthly_favorite_mean = tweet_master.groupby(favorite_month)["favorite_count"].mean()
monthly_favorite_mean.plot(kind='bar')
plt.title("Favorite tweet count in Months")
plt.xlabel("Months")
plt.ylabel("favorite_count")
plt.savefig('favorite_tweet_count_in_months.png')